Importing libraries and the data frame¶
In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
In [2]:
df = pd.read_csv('Train_Data.csv')
df.shape
Out[2]:
(86845, 43)
In [3]:
df.head()
Out[3]:
duration | protocoltype | service | flag | srcbytes | dstbytes | land | wrongfragment | urgent | hot | ... | dsthostsamesrvrate | dsthostdiffsrvrate | dsthostsamesrcportrate | dsthostsrvdiffhostrate | dsthostserrorrate | dsthostsrvserrorrate | dsthostrerrorrate | dsthostsrvrerrorrate | lastflag | attack | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | tcp | netbios_dgm | REJ | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0.06 | 0.06 | 0.00 | 0.00 | 0.00 | 0.0 | 1.00 | 1.0 | 21 | 1 |
1 | 0 | tcp | smtp | SF | 1239 | 400 | 0 | 0 | 0 | 0 | ... | 0.45 | 0.04 | 0.00 | 0.00 | 0.11 | 0.0 | 0.02 | 0.0 | 18 | 0 |
2 | 0 | tcp | http | SF | 222 | 945 | 0 | 0 | 0 | 0 | ... | 1.00 | 0.00 | 0.02 | 0.03 | 0.00 | 0.0 | 0.00 | 0.0 | 21 | 0 |
3 | 0 | tcp | http | SF | 235 | 1380 | 0 | 0 | 0 | 0 | ... | 1.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.0 | 0.00 | 0.0 | 21 | 0 |
4 | 0 | tcp | uucp_path | REJ | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0.01 | 0.08 | 0.00 | 0.00 | 0.00 | 0.0 | 1.00 | 1.0 | 19 | 1 |
5 rows × 43 columns
In [4]:
#ENCODING CATEGORICAL VALUES
df = pd.get_dummies(df, columns=["protocoltype"])
df.head()
Out[4]:
duration | service | flag | srcbytes | dstbytes | land | wrongfragment | urgent | hot | numfailedlogins | ... | dsthostsrvdiffhostrate | dsthostserrorrate | dsthostsrvserrorrate | dsthostrerrorrate | dsthostsrvrerrorrate | lastflag | attack | protocoltype_icmp | protocoltype_tcp | protocoltype_udp | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | netbios_dgm | REJ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0.00 | 0.00 | 0.0 | 1.00 | 1.0 | 21 | 1 | False | True | False |
1 | 0 | smtp | SF | 1239 | 400 | 0 | 0 | 0 | 0 | 0 | ... | 0.00 | 0.11 | 0.0 | 0.02 | 0.0 | 18 | 0 | False | True | False |
2 | 0 | http | SF | 222 | 945 | 0 | 0 | 0 | 0 | 0 | ... | 0.03 | 0.00 | 0.0 | 0.00 | 0.0 | 21 | 0 | False | True | False |
3 | 0 | http | SF | 235 | 1380 | 0 | 0 | 0 | 0 | 0 | ... | 0.00 | 0.00 | 0.0 | 0.00 | 0.0 | 21 | 0 | False | True | False |
4 | 0 | uucp_path | REJ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0.00 | 0.00 | 0.0 | 1.00 | 1.0 | 19 | 1 | False | True | False |
5 rows × 45 columns
In [5]:
df['protocoltype_icmp'] = df['protocoltype_icmp'].astype(int)
df['protocoltype_tcp'] = df['protocoltype_tcp'].astype(int)
df['protocoltype_udp'] = df['protocoltype_udp'].astype(int)
df.head()
Out[5]:
duration | service | flag | srcbytes | dstbytes | land | wrongfragment | urgent | hot | numfailedlogins | ... | dsthostsrvdiffhostrate | dsthostserrorrate | dsthostsrvserrorrate | dsthostrerrorrate | dsthostsrvrerrorrate | lastflag | attack | protocoltype_icmp | protocoltype_tcp | protocoltype_udp | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | netbios_dgm | REJ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0.00 | 0.00 | 0.0 | 1.00 | 1.0 | 21 | 1 | 0 | 1 | 0 |
1 | 0 | smtp | SF | 1239 | 400 | 0 | 0 | 0 | 0 | 0 | ... | 0.00 | 0.11 | 0.0 | 0.02 | 0.0 | 18 | 0 | 0 | 1 | 0 |
2 | 0 | http | SF | 222 | 945 | 0 | 0 | 0 | 0 | 0 | ... | 0.03 | 0.00 | 0.0 | 0.00 | 0.0 | 21 | 0 | 0 | 1 | 0 |
3 | 0 | http | SF | 235 | 1380 | 0 | 0 | 0 | 0 | 0 | ... | 0.00 | 0.00 | 0.0 | 0.00 | 0.0 | 21 | 0 | 0 | 1 | 0 |
4 | 0 | uucp_path | REJ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0.00 | 0.00 | 0.0 | 1.00 | 1.0 | 19 | 1 | 0 | 1 | 0 |
5 rows × 45 columns
In [6]:
service_map=df['service'].value_counts().to_dict()
In [7]:
service_map
Out[7]:
{'http': 31505, 'private': 12859, 'domain_u': 7196, 'smtp': 5806, 'ftp_data': 4929, 'other': 2131, 'telnet': 1780, 'finger': 1341, 'ftp': 1134, 'auth': 758, 'Z39_50': 678, 'uucp': 626, 'courier': 583, 'uucp_path': 556, 'bgp': 554, 'whois': 544, 'iso_tsap': 534, 'time': 516, 'nnsp': 511, 'imap4': 494, 'vmnet': 492, 'urp_i': 477, 'domain': 445, 'supdup': 428, 'csnet_ns': 423, 'discard': 423, 'ctf': 422, 'http_443': 415, 'eco_i': 401, 'daytime': 389, 'exec': 381, 'gopher': 376, 'efs': 374, 'systat': 361, 'name': 353, 'link': 350, 'hostnames': 349, 'login': 346, 'klogin': 345, 'mtp': 335, 'echo': 334, 'ldap': 315, 'netbios_dgm': 310, 'sunrpc': 303, 'netbios_ssn': 289, 'netstat': 282, 'netbios_ns': 268, 'ssh': 235, 'kshell': 232, 'nntp': 227, 'pop_3': 195, 'sql_net': 194, 'ecr_i': 143, 'IRC': 139, 'ntp_u': 134, 'rje': 60, 'X11': 52, 'shell': 49, 'remote_job': 49, 'pop_2': 47, 'printer': 46, 'urh_i': 9, 'red_i': 6, 'tim_i': 4, 'tftp_u': 3}
In [8]:
df['service']=df['service'].map(service_map)
df.head()
Out[8]:
duration | service | flag | srcbytes | dstbytes | land | wrongfragment | urgent | hot | numfailedlogins | ... | dsthostsrvdiffhostrate | dsthostserrorrate | dsthostsrvserrorrate | dsthostrerrorrate | dsthostsrvrerrorrate | lastflag | attack | protocoltype_icmp | protocoltype_tcp | protocoltype_udp | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 310 | REJ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0.00 | 0.00 | 0.0 | 1.00 | 1.0 | 21 | 1 | 0 | 1 | 0 |
1 | 0 | 5806 | SF | 1239 | 400 | 0 | 0 | 0 | 0 | 0 | ... | 0.00 | 0.11 | 0.0 | 0.02 | 0.0 | 18 | 0 | 0 | 1 | 0 |
2 | 0 | 31505 | SF | 222 | 945 | 0 | 0 | 0 | 0 | 0 | ... | 0.03 | 0.00 | 0.0 | 0.00 | 0.0 | 21 | 0 | 0 | 1 | 0 |
3 | 0 | 31505 | SF | 235 | 1380 | 0 | 0 | 0 | 0 | 0 | ... | 0.00 | 0.00 | 0.0 | 0.00 | 0.0 | 21 | 0 | 0 | 1 | 0 |
4 | 0 | 556 | REJ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0.00 | 0.00 | 0.0 | 1.00 | 1.0 | 19 | 1 | 0 | 1 | 0 |
5 rows × 45 columns
In [9]:
flag_map = df['flag'].value_counts().to_dict()
flag_map
Out[9]:
{'SF': 50672, 'S0': 27773, 'REJ': 6721, 'RSTO': 1127, 'S1': 294, 'RSTR': 106, 'S2': 103, 'S3': 38, 'OTH': 10, 'SH': 1}
In [10]:
df['flag'] = df['flag'].map(flag_map)
df.head()
Out[10]:
duration | service | flag | srcbytes | dstbytes | land | wrongfragment | urgent | hot | numfailedlogins | ... | dsthostsrvdiffhostrate | dsthostserrorrate | dsthostsrvserrorrate | dsthostrerrorrate | dsthostsrvrerrorrate | lastflag | attack | protocoltype_icmp | protocoltype_tcp | protocoltype_udp | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 310 | 6721 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0.00 | 0.00 | 0.0 | 1.00 | 1.0 | 21 | 1 | 0 | 1 | 0 |
1 | 0 | 5806 | 50672 | 1239 | 400 | 0 | 0 | 0 | 0 | 0 | ... | 0.00 | 0.11 | 0.0 | 0.02 | 0.0 | 18 | 0 | 0 | 1 | 0 |
2 | 0 | 31505 | 50672 | 222 | 945 | 0 | 0 | 0 | 0 | 0 | ... | 0.03 | 0.00 | 0.0 | 0.00 | 0.0 | 21 | 0 | 0 | 1 | 0 |
3 | 0 | 31505 | 50672 | 235 | 1380 | 0 | 0 | 0 | 0 | 0 | ... | 0.00 | 0.00 | 0.0 | 0.00 | 0.0 | 21 | 0 | 0 | 1 | 0 |
4 | 0 | 556 | 6721 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0.00 | 0.00 | 0.0 | 1.00 | 1.0 | 19 | 1 | 0 | 1 | 0 |
5 rows × 45 columns
In [11]:
df.dtypes
Out[11]:
duration int64 service int64 flag int64 srcbytes int64 dstbytes int64 land int64 wrongfragment int64 urgent int64 hot int64 numfailedlogins int64 loggedin int64 numcompromised int64 rootshell int64 suattempted int64 numroot int64 numfilecreations int64 numshells int64 numaccessfiles int64 numoutboundcmds int64 ishostlogin int64 isguestlogin int64 count int64 srvcount int64 serrorrate float64 srvserrorrate float64 rerrorrate float64 srvrerrorrate float64 samesrvrate float64 diffsrvrate float64 srvdiffhostrate float64 dsthostcount int64 dsthostsrvcount int64 dsthostsamesrvrate float64 dsthostdiffsrvrate float64 dsthostsamesrcportrate float64 dsthostsrvdiffhostrate float64 dsthostserrorrate float64 dsthostsrvserrorrate float64 dsthostrerrorrate float64 dsthostsrvrerrorrate float64 lastflag int64 attack int64 protocoltype_icmp int32 protocoltype_tcp int32 protocoltype_udp int32 dtype: object
In [12]:
df.isnull().sum()
Out[12]:
duration 0 service 0 flag 0 srcbytes 0 dstbytes 0 land 0 wrongfragment 0 urgent 0 hot 0 numfailedlogins 0 loggedin 0 numcompromised 0 rootshell 0 suattempted 0 numroot 0 numfilecreations 0 numshells 0 numaccessfiles 0 numoutboundcmds 0 ishostlogin 0 isguestlogin 0 count 0 srvcount 0 serrorrate 0 srvserrorrate 0 rerrorrate 0 srvrerrorrate 0 samesrvrate 0 diffsrvrate 0 srvdiffhostrate 0 dsthostcount 0 dsthostsrvcount 0 dsthostsamesrvrate 0 dsthostdiffsrvrate 0 dsthostsamesrcportrate 0 dsthostsrvdiffhostrate 0 dsthostserrorrate 0 dsthostsrvserrorrate 0 dsthostrerrorrate 0 dsthostsrvrerrorrate 0 lastflag 0 attack 0 protocoltype_icmp 0 protocoltype_tcp 0 protocoltype_udp 0 dtype: int64
Preparing the X and y labels¶
In [13]:
X = df.drop('attack', axis=1)
X.head()
Out[13]:
duration | service | flag | srcbytes | dstbytes | land | wrongfragment | urgent | hot | numfailedlogins | ... | dsthostsamesrcportrate | dsthostsrvdiffhostrate | dsthostserrorrate | dsthostsrvserrorrate | dsthostrerrorrate | dsthostsrvrerrorrate | lastflag | protocoltype_icmp | protocoltype_tcp | protocoltype_udp | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 310 | 6721 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0.00 | 0.00 | 0.00 | 0.0 | 1.00 | 1.0 | 21 | 0 | 1 | 0 |
1 | 0 | 5806 | 50672 | 1239 | 400 | 0 | 0 | 0 | 0 | 0 | ... | 0.00 | 0.00 | 0.11 | 0.0 | 0.02 | 0.0 | 18 | 0 | 1 | 0 |
2 | 0 | 31505 | 50672 | 222 | 945 | 0 | 0 | 0 | 0 | 0 | ... | 0.02 | 0.03 | 0.00 | 0.0 | 0.00 | 0.0 | 21 | 0 | 1 | 0 |
3 | 0 | 31505 | 50672 | 235 | 1380 | 0 | 0 | 0 | 0 | 0 | ... | 0.00 | 0.00 | 0.00 | 0.0 | 0.00 | 0.0 | 21 | 0 | 1 | 0 |
4 | 0 | 556 | 6721 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0.00 | 0.00 | 0.00 | 0.0 | 1.00 | 1.0 | 19 | 0 | 1 | 0 |
5 rows × 44 columns
In [14]:
X.shape
Out[14]:
(86845, 44)
In [15]:
y = df['attack']
y.head()
Out[15]:
0 1 1 0 2 0 3 0 4 1 Name: attack, dtype: int64
Train and Test split¶
In [16]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
In [17]:
#SCALING THE DATA
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X_train = pd.DataFrame(scaler.fit_transform(X_train), columns = X.columns)
X_train.head()
Out[17]:
duration | service | flag | srcbytes | dstbytes | land | wrongfragment | urgent | hot | numfailedlogins | ... | dsthostsamesrcportrate | dsthostsrvdiffhostrate | dsthostserrorrate | dsthostsrvserrorrate | dsthostrerrorrate | dsthostsrvrerrorrate | lastflag | protocoltype_icmp | protocoltype_tcp | protocoltype_udp | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.000000 | 0.184211 | 1.000000 | 0.000013 | 0.000047 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.01 | 0.02 | 0.01 | 0.01 | 0.0 | 0.0 | 1.00 | 0.0 | 1.0 | 0.0 |
1 | 0.000000 | 0.021427 | 0.548085 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.00 | 0.00 | 1.00 | 1.00 | 0.0 | 0.0 | 0.95 | 0.0 | 1.0 | 0.0 |
2 | 0.000000 | 0.013332 | 0.548085 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.00 | 0.00 | 1.00 | 1.00 | 0.0 | 0.0 | 0.95 | 0.0 | 1.0 | 0.0 |
3 | 0.000000 | 1.000000 | 1.000000 | 0.000002 | 0.002730 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.00 | 0.00 | 0.00 | 0.00 | 0.0 | 0.0 | 1.00 | 0.0 | 1.0 | 0.0 |
4 | 0.124092 | 0.067551 | 1.000000 | 0.000002 | 0.000015 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.85 | 0.00 | 0.00 | 0.00 | 0.0 | 0.0 | 1.00 | 0.0 | 0.0 | 1.0 |
5 rows × 44 columns
In [18]:
X_test = pd.DataFrame(scaler.transform(X_test), columns = X.columns)
X_test.head()
Out[18]:
duration | service | flag | srcbytes | dstbytes | land | wrongfragment | urgent | hot | numfailedlogins | ... | dsthostsamesrcportrate | dsthostsrvdiffhostrate | dsthostserrorrate | dsthostsrvserrorrate | dsthostrerrorrate | dsthostsrvrerrorrate | lastflag | protocoltype_icmp | protocoltype_tcp | protocoltype_udp | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | 0.156371 | 1.000000 | 7.994953e-05 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 1.00 | 0.02 | 0.0 | 0.0 | 0.0 | 0.0 | 0.95 | 0.0 | 1.0 | 0.0 |
1 | 0.0 | 0.228335 | 1.000000 | 4.911727e-07 | 0.000018 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.00 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.90 | 0.0 | 0.0 | 1.0 |
2 | 0.0 | 1.000000 | 1.000000 | 2.913547e-06 | 0.000119 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.04 | 0.04 | 0.0 | 0.0 | 0.0 | 0.0 | 1.00 | 0.0 | 1.0 | 0.0 |
3 | 0.0 | 0.408101 | 0.548085 | 0.000000e+00 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.01 | 0.00 | 1.0 | 1.0 | 0.0 | 0.0 | 0.85 | 0.0 | 1.0 | 0.0 |
4 | 0.0 | 0.009745 | 0.548085 | 0.000000e+00 | 0.000000 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.00 | 0.00 | 1.0 | 1.0 | 0.0 | 0.0 | 0.90 | 0.0 | 1.0 | 0.0 |
5 rows × 44 columns
LOGISTIC REGRESSION¶
In [19]:
from sklearn.linear_model import LogisticRegression
lr = LogisticRegression()
lr.fit(X_train, y_train)
Out[19]:
LogisticRegression()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression()
In [20]:
lr_pred = lr.predict(X_test)
In [22]:
from sklearn.metrics import f1_score
s1 = f1_score(y_test, lr_pred)
s1
Out[22]:
0.9996324880558618
SVM¶
In [23]:
from sklearn.svm import SVC
In [24]:
svc = SVC()
svc.fit(X_test, y_test)
Out[24]:
SVC()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SVC()
In [25]:
svc_pred = svc.predict(X_test)
In [26]:
from sklearn.metrics import f1_score
s2 = f1_score(y_test, svc_pred)
s2
Out[26]:
0.9999540631172769
KNN¶
In [27]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
#Euclidean Distance
knn.fit(X_train, y_train)
Out[27]:
KNeighborsClassifier()In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
KNeighborsClassifier()
In [28]:
knn_pred = knn.predict(X_test)
In [29]:
from sklearn.metrics import f1_score
s3 = f1_score(y_test, knn_pred)
s3
Out[29]:
0.9999081304547541
RANDOM FOREST¶
In [30]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(n_estimators=50)
rf.fit(X_train, y_train)
#Change the no. of estimators and see the score
Out[30]:
RandomForestClassifier(n_estimators=50)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier(n_estimators=50)
In [31]:
rf_pred = rf.predict(X_test)
s3 = f1_score(y_test, rf_pred)
s3
Out[31]:
0.9998621766894842
NN¶
In [32]:
import keras
In [33]:
#import tensorflow
from keras.models import Sequential
from keras.layers import Dense
from keras.utils import to_categorical
In [34]:
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)
In [35]:
model = Sequential()
model.add(Dense(500, input_dim=44, activation='relu'))
model.add(Dense(100, activation='relu'))
model.add(Dense(50, activation='relu'))
model.add(Dense(2, activation='softmax'))
C:\Users\HP\anaconda3\Lib\site-packages\keras\src\layers\core\dense.py:87: UserWarning: Do not pass an `input_shape`/`input_dim` argument to a layer. When using Sequential models, prefer using an `Input(shape)` object as the first layer in the model instead. super().__init__(activity_regularizer=activity_regularizer, **kwargs)
In [36]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
In [37]:
model.fit(X_train, y_train, epochs=20)
Epoch 1/20 1819/1819 ━━━━━━━━━━━━━━━━━━━━ 4s 2ms/step - accuracy: 0.9975 - loss: 0.0205 Epoch 2/20 1819/1819 ━━━━━━━━━━━━━━━━━━━━ 3s 2ms/step - accuracy: 0.9996 - loss: 8.9060e-04 Epoch 3/20 1819/1819 ━━━━━━━━━━━━━━━━━━━━ 3s 2ms/step - accuracy: 1.0000 - loss: 8.9667e-06 Epoch 4/20 1819/1819 ━━━━━━━━━━━━━━━━━━━━ 4s 2ms/step - accuracy: 1.0000 - loss: 4.1097e-07 Epoch 5/20 1819/1819 ━━━━━━━━━━━━━━━━━━━━ 4s 2ms/step - accuracy: 1.0000 - loss: 7.6159e-08 Epoch 6/20 1819/1819 ━━━━━━━━━━━━━━━━━━━━ 4s 2ms/step - accuracy: 1.0000 - loss: 2.5925e-08 Epoch 7/20 1819/1819 ━━━━━━━━━━━━━━━━━━━━ 3s 2ms/step - accuracy: 1.0000 - loss: 9.0305e-09 Epoch 8/20 1819/1819 ━━━━━━━━━━━━━━━━━━━━ 3s 2ms/step - accuracy: 1.0000 - loss: 2.6525e-09 Epoch 9/20 1819/1819 ━━━━━━━━━━━━━━━━━━━━ 3s 2ms/step - accuracy: 1.0000 - loss: 1.6659e-09 Epoch 10/20 1819/1819 ━━━━━━━━━━━━━━━━━━━━ 3s 2ms/step - accuracy: 1.0000 - loss: 7.3756e-10 Epoch 11/20 1819/1819 ━━━━━━━━━━━━━━━━━━━━ 3s 2ms/step - accuracy: 1.0000 - loss: 3.1207e-10 Epoch 12/20 1819/1819 ━━━━━━━━━━━━━━━━━━━━ 3s 2ms/step - accuracy: 1.0000 - loss: 3.3202e-10 Epoch 13/20 1819/1819 ━━━━━━━━━━━━━━━━━━━━ 3s 2ms/step - accuracy: 1.0000 - loss: 1.0638e-10 Epoch 14/20 1819/1819 ━━━━━━━━━━━━━━━━━━━━ 3s 2ms/step - accuracy: 1.0000 - loss: 6.7138e-11 Epoch 15/20 1819/1819 ━━━━━━━━━━━━━━━━━━━━ 3s 2ms/step - accuracy: 1.0000 - loss: 6.5744e-11 Epoch 16/20 1819/1819 ━━━━━━━━━━━━━━━━━━━━ 3s 2ms/step - accuracy: 1.0000 - loss: 2.2968e-11 Epoch 17/20 1819/1819 ━━━━━━━━━━━━━━━━━━━━ 3s 2ms/step - accuracy: 1.0000 - loss: 2.0289e-11 Epoch 18/20 1819/1819 ━━━━━━━━━━━━━━━━━━━━ 3s 2ms/step - accuracy: 1.0000 - loss: 1.1217e-11 Epoch 19/20 1819/1819 ━━━━━━━━━━━━━━━━━━━━ 3s 2ms/step - accuracy: 1.0000 - loss: 1.0516e-11 Epoch 20/20 1819/1819 ━━━━━━━━━━━━━━━━━━━━ 3s 2ms/step - accuracy: 1.0000 - loss: 1.6686e-11
Out[37]:
<keras.src.callbacks.history.History at 0x2ea6a4da990>
In [39]:
pred = model.predict(X_test)
scores = model.evaluate(X_test, y_test, verbose=0)
print('Accuracy on test data: {}% \n Error on test data: {}'.format(scores[1], 1 - scores[1]))
896/896 ━━━━━━━━━━━━━━━━━━━━ 1s 1ms/step Accuracy on test data: 0.9999651312828064% Error on test data: 3.4868717193603516e-05
We will use SVM¶
Testing¶
In [40]:
test = pd.read_csv('Test_Data.csv')
test.head()
Out[40]:
duration | protocoltype | service | flag | srcbytes | dstbytes | land | wrongfragment | urgent | hot | ... | dsthostsrvcount | dsthostsamesrvrate | dsthostdiffsrvrate | dsthostsamesrcportrate | dsthostsrvdiffhostrate | dsthostserrorrate | dsthostsrvserrorrate | dsthostrerrorrate | dsthostsrvrerrorrate | lastflag | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | tcp | mtp | REJ | 0 | 0 | 0 | 0 | 0 | 0 | ... | 7 | 0.03 | 0.08 | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 20 |
1 | 0 | tcp | http | SF | 199 | 1721 | 0 | 0 | 0 | 0 | ... | 255 | 1.00 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 21 |
2 | 0 | tcp | discard | S0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 14 | 0.05 | 0.09 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 18 |
3 | 0 | tcp | telnet | S0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 2 | 0.01 | 0.09 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 18 |
4 | 0 | tcp | exec | S0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 16 | 0.06 | 0.06 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 20 |
5 rows × 42 columns
In [41]:
#ENCODING CATEGORICAL VALUES
test = pd.get_dummies(test, columns=["protocoltype"])
test.head()
Out[41]:
duration | service | flag | srcbytes | dstbytes | land | wrongfragment | urgent | hot | numfailedlogins | ... | dsthostsamesrcportrate | dsthostsrvdiffhostrate | dsthostserrorrate | dsthostsrvserrorrate | dsthostrerrorrate | dsthostsrvrerrorrate | lastflag | protocoltype_icmp | protocoltype_tcp | protocoltype_udp | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | mtp | REJ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 20 | False | True | False |
1 | 0 | http | SF | 199 | 1721 | 0 | 0 | 0 | 0 | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 21 | False | True | False |
2 | 0 | discard | S0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 18 | False | True | False |
3 | 0 | telnet | S0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 18 | False | True | False |
4 | 0 | exec | S0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 20 | False | True | False |
5 rows × 44 columns
In [42]:
test['protocoltype_icmp'] = test['protocoltype_icmp'].astype(int)
test['protocoltype_tcp'] = test['protocoltype_tcp'].astype(int)
test['protocoltype_udp'] = test['protocoltype_udp'].astype(int)
test.head()
Out[42]:
duration | service | flag | srcbytes | dstbytes | land | wrongfragment | urgent | hot | numfailedlogins | ... | dsthostsamesrcportrate | dsthostsrvdiffhostrate | dsthostserrorrate | dsthostsrvserrorrate | dsthostrerrorrate | dsthostsrvrerrorrate | lastflag | protocoltype_icmp | protocoltype_tcp | protocoltype_udp | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | mtp | REJ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 20 | 0 | 1 | 0 |
1 | 0 | http | SF | 199 | 1721 | 0 | 0 | 0 | 0 | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 21 | 0 | 1 | 0 |
2 | 0 | discard | S0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 18 | 0 | 1 | 0 |
3 | 0 | telnet | S0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 18 | 0 | 1 | 0 |
4 | 0 | exec | S0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 20 | 0 | 1 | 0 |
5 rows × 44 columns
In [43]:
service_test=test['service'].value_counts().to_dict()
In [44]:
test['service']=test['service'].map(service_test)
test.head()
Out[44]:
duration | service | flag | srcbytes | dstbytes | land | wrongfragment | urgent | hot | numfailedlogins | ... | dsthostsamesrcportrate | dsthostsrvdiffhostrate | dsthostserrorrate | dsthostsrvserrorrate | dsthostrerrorrate | dsthostsrvrerrorrate | lastflag | protocoltype_icmp | protocoltype_tcp | protocoltype_udp | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 81 | REJ | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 20 | 0 | 1 | 0 |
1 | 0 | 7843 | SF | 199 | 1721 | 0 | 0 | 0 | 0 | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 21 | 0 | 1 | 0 |
2 | 0 | 97 | S0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 18 | 0 | 1 | 0 |
3 | 0 | 448 | S0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 18 | 0 | 1 | 0 |
4 | 0 | 84 | S0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 20 | 0 | 1 | 0 |
5 rows × 44 columns
In [45]:
flag_test=test['flag'].value_counts().to_dict()
test['flag']=test['flag'].map(flag_test)
test.head()
Out[45]:
duration | service | flag | srcbytes | dstbytes | land | wrongfragment | urgent | hot | numfailedlogins | ... | dsthostsamesrcportrate | dsthostsrvdiffhostrate | dsthostserrorrate | dsthostsrvserrorrate | dsthostrerrorrate | dsthostsrvrerrorrate | lastflag | protocoltype_icmp | protocoltype_tcp | protocoltype_udp | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 81 | 1643 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 1.0 | 1.0 | 20 | 0 | 1 | 0 |
1 | 0 | 7843 | 12722 | 199 | 1721 | 0 | 0 | 0 | 0 | 0 | ... | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 21 | 0 | 1 | 0 |
2 | 0 | 97 | 6907 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 18 | 0 | 1 | 0 |
3 | 0 | 448 | 6907 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 18 | 0 | 1 | 0 |
4 | 0 | 84 | 6907 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | ... | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 0.0 | 20 | 0 | 1 | 0 |
5 rows × 44 columns
In [46]:
X_test = pd.DataFrame(scaler.transform(X_test))
In [47]:
X_test.head()
Out[47]:
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0.0 | -0.000090 | -3.388132e-21 | 8.924779e-13 | 0.000000e+00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 1.00 | 0.02 | 0.0 | 0.0 | 0.0 | 0.0 | -0.0025 | 0.0 | 1.0 | 0.0 |
1 | 0.0 | -0.000088 | -3.388132e-21 | 5.482969e-15 | 2.590991e-12 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.00 | 0.00 | 0.0 | 0.0 | 0.0 | 0.0 | -0.0050 | 0.0 | 0.0 | 1.0 |
2 | 0.0 | -0.000063 | -3.388132e-21 | 3.252398e-14 | 1.688193e-11 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.04 | 0.04 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0000 | 0.0 | 1.0 | 0.0 |
3 | 0.0 | -0.000082 | -8.918618e-06 | 0.000000e+00 | 0.000000e+00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.01 | 0.00 | 1.0 | 1.0 | 0.0 | 0.0 | -0.0075 | 0.0 | 1.0 | 0.0 |
4 | 0.0 | -0.000095 | -8.918618e-06 | 0.000000e+00 | 0.000000e+00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | ... | 0.00 | 0.00 | 1.0 | 1.0 | 0.0 | 0.0 | -0.0050 | 0.0 | 1.0 | 0.0 |
5 rows × 44 columns
In [48]:
test.dtypes
Out[48]:
duration int64 service int64 flag int64 srcbytes int64 dstbytes int64 land int64 wrongfragment int64 urgent int64 hot int64 numfailedlogins int64 loggedin int64 numcompromised int64 rootshell int64 suattempted int64 numroot int64 numfilecreations int64 numshells int64 numaccessfiles int64 numoutboundcmds int64 ishostlogin int64 isguestlogin int64 count int64 srvcount int64 serrorrate float64 srvserrorrate float64 rerrorrate float64 srvrerrorrate float64 samesrvrate float64 diffsrvrate float64 srvdiffhostrate float64 dsthostcount int64 dsthostsrvcount int64 dsthostsamesrvrate float64 dsthostdiffsrvrate float64 dsthostsamesrcportrate float64 dsthostsrvdiffhostrate float64 dsthostserrorrate float64 dsthostsrvserrorrate float64 dsthostrerrorrate float64 dsthostsrvrerrorrate float64 lastflag int64 protocoltype_icmp int32 protocoltype_tcp int32 protocoltype_udp int32 dtype: object
In [52]:
y_pred = pd.DataFrame(svc.predict(X_test), columns=['attack'])
C:\Users\HP\anaconda3\Lib\site-packages\sklearn\base.py:439: UserWarning: X does not have valid feature names, but SVC was fitted with feature names warnings.warn(
In [53]:
y_pred.head()
Out[53]:
attack | |
---|---|
0 | 0 |
1 | 0 |
2 | 0 |
3 | 1 |
4 | 1 |
In [54]:
y_pred.to_csv('attack.csv', index=False, header=True)
In [ ]: